{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 机器学习拟合Openrank指标"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Github: https://github.com/BhJia/Openrank-fitting  \n",
    "数据、使用方法教程及详细分析均在仓库中"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 引入相关包\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import os\n",
    "import json\n",
    "from tqdm import tqdm\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn import preprocessing\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "import xgboost as xgb\n",
    "import lightgbm as lgb\n",
    "from sklearn.metrics import mean_squared_error,explained_variance_score,r2_score,mean_absolute_error\n",
    "np.set_printoptions(suppress=True)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "定义指标分类"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "metrics_unprocessed=[\"active_dates_and_times\",\n",
    "                     \"issue_response_time\",\n",
    "                     \"issue_resolution_duration\",\n",
    "                     \"issue_age\",\n",
    "                     \"change_request_response_time\",\n",
    "                     \"change_request_resolution_duration\",\n",
    "                     \"change_request_age\"\n",
    "                     ]\n",
    "\n",
    "metrics_processed=[\"openrank\",\n",
    "                   \"technical_fork\",\n",
    "                   \"new_contributors\",\n",
    "                   \"inactive_contributors\",\n",
    "                   \"bus_factor\",\n",
    "                   \"issues_new\",\n",
    "                   \"issues_closed\",\n",
    "                   \"code_change_lines_add\",\n",
    "                   \"code_change_lines_remove\",\n",
    "                   \"code_change_lines_sum\",\n",
    "                   \"change_requests\",\n",
    "                   \"change_requests_accepted\",\n",
    "                   \"change_requests_reviews\"\n",
    "                  ]"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "定义数据预处理函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 输入数据路径\n",
    "data_path=\"./data/top_300_metrics/\"\n",
    "\n",
    "# 根据指标分类获取指标数据\n",
    "def get_value(file,metric):\n",
    "    with open(file) as f:\n",
    "        data = json.load(f)\n",
    "\n",
    "        # 如果指标是有统计信息的,取均值\n",
    "        if metric in metrics_unprocessed:\n",
    "            data=data['avg']\n",
    "        if \"2021-10-raw\" in data:\n",
    "            data.pop(\"2021-10-raw\")\n",
    "    f.close()\n",
    "    return data\n",
    "\n",
    "# 对时间进行编码\n",
    "def time_encoding(df):\n",
    "    df[\"time\"] = pd.to_datetime(df[\"time\"], format=\"%Y-%m\")\n",
    "    df[\"Time\"] = (df[\"time\"].dt.year - df[\"time\"].dt.year.min()) * 12 + df[\"time\"].dt.month\n",
    "    return df\n",
    "\n",
    "# 判断是否是CHAOSS指标\n",
    "def is_CHAOSS_metric(metric):\n",
    "    return (metric in metrics_unprocessed) or (metric in metrics_processed)\n",
    "\n",
    "# 初始化数据表\n",
    "def init_metric_table(project_path):\n",
    "    time=[]\n",
    "    active_dates_and_times=[]\n",
    "    \n",
    "    # 先选择active_dates_and_times这一个指标初始化数据表,之后再将其他指标数据加入表中\n",
    "    with open(os.path.join(project_path,metrics_unprocessed[0]+\".json\")) as f:\n",
    "        data=json.load(f)\n",
    "        for key,value in data.items():\n",
    "            time.append(key)\n",
    "            active_dates_and_times.append(np.average(np.array(value)))\n",
    "    metric_table=pd.DataFrame({\"time\":time,\"active_dates_and_times\":active_dates_and_times})\n",
    "    return metric_table\n",
    "\n",
    "# 获取整个数据表\n",
    "def get_project_metric_table(project_path):\n",
    "    metrics_table=init_metric_table(project_path)\n",
    "    for file in os.listdir(project_path):\n",
    "        file_path=os.path.join(project_path,file)\n",
    "        metric=file[:-5]\n",
    "        if metric==metrics_unprocessed[0] or not is_CHAOSS_metric(metric):\n",
    "            continue\n",
    "\n",
    "        # 获取指标数据\n",
    "        data=get_value(file_path,metric)\n",
    "\n",
    "        # 按照时间顺序排列,并填充缺失值\n",
    "        metrics_table[metric]=metrics_table[\"time\"].map(data).fillna(0)\n",
    "    return metrics_table"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "读取数据并进行数据预处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "metric_table=[]     # 数据表\n",
    "\n",
    "# 对每个组织\n",
    "for organization in tqdm(os.listdir(data_path)):\n",
    "    org_path=os.path.join(data_path,organization)\n",
    "\n",
    "    # 对组织的每个项目\n",
    "    for project in os.listdir(org_path):\n",
    "        project_path=os.path.join(org_path,project)\n",
    "        proj_metric_table=get_project_metric_table(project_path)\n",
    "        metric_table.append(proj_metric_table)\n",
    "        # corr_proj=proj_metric_table.corr()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "进一步处理并划分训练集验证集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 合并所有项目的数据\n",
    "data=pd.concat(metric_table).fillna(0)\n",
    "\n",
    "# 重新设置索引\n",
    "data = data.reset_index(drop=True)\n",
    "\n",
    "# 将时间编码\n",
    "data=time_encoding(data)\n",
    "Data=data.drop(columns=[\"time\"])\n",
    "\n",
    "# 划分训练集验证集\n",
    "test_data=data[data['time']>=\"2023-01\"]\n",
    "train_data=data[data['time']<\"2023-01\"]\n",
    "X_train=train_data.drop(columns=[\"openrank\",\"time\"])\n",
    "y_train=train_data[\"openrank\"]\n",
    "X_test=test_data.drop(columns=[\"openrank\",\"time\"])\n",
    "y_test=test_data[\"openrank\"]\n",
    "\n",
    "print(\"Data Shape:{}, Train data shape:{}, Test data Shape:{}\".format(Data.shape,X_train.shape,X_test.shape))"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "查看数据所有特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Data.columns"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "查看时间编码"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Data['Time']"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "定义一些绘图函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 分布图\n",
    "def displot(x):\n",
    "    fig = plt.figure(dpi=150)\n",
    "    sns.distplot(x)\n",
    "\n",
    "# 另一种分布图\n",
    "def hisplot(x):\n",
    "    fig = plt.figure(dpi=150)\n",
    "    sns.histplot(x)\n",
    "\n",
    "# 相关性热力图\n",
    "def heatmap(X):\n",
    "    f,ax = plt.subplots(figsize=(15, 15))\n",
    "    sns.heatmap(X.corr(), annot=True, linecolor='white',linewidths=0.1,cmap=\"RdBu\", fmt= '.1f',ax=ax)\n",
    "\n",
    "# 直方分布图\n",
    "def hist(Data):\n",
    "    Data.hist(bins=30, figsize=(20,15),color='#A50021')\n",
    "\n",
    "# 双变量的相关性图\n",
    "def jointplot(x,y,data):\n",
    "    fig = plt.figure(dpi=150)\n",
    "    sns.jointplot(x=x, y=y, data=data, kind=\"hex\",color=\"#A50021\",ratio=8, space=0, height=8, marginal_kws={'bins':10,'kde':True})\n",
    "    plt.xlabel(x, fontsize=15)\n",
    "    plt.ylabel(y, fontsize=15)\n",
    "    plt.show()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "待拟合变量OpenRank值的分布情况"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Data['openrank'].describe()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "两种分布图"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "displot(Data['openrank'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "hisplot(Data['openrank'])"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "所有特征的分布直方图"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "hist(Data)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "相关性热力图"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "heatmap(Data)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "openrank和其他变量的相关性表（降序）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Data.corr()[\"openrank\"].sort_values(ascending=False)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "输入变量与目标变量之间的关系\n",
    "1. 时序数据和OpenRank值的关系"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "jointplot(\"Time\",\"openrank\",Data.iloc[::10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "jointplot(\"issue_response_time\",\"openrank\",Data[::20])"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "多重共线性判断"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 计算X'X特征值\n",
    "eigenvalues=np.linalg.eigvals(Data.T @ Data)\n",
    "\n",
    "# 计算条件数\n",
    "Condition_Number = np.sqrt(np.abs(np.max(eigenvalues)/np.min(eigenvalues)))\n",
    "\n",
    "print(\"eigenvalues:{}\".format(eigenvalues))\n",
    "print(\"Condition_Number:{}\".format(Condition_Number))"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "滞后特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 添加一阶滞后特征\n",
    "def add_lag_feature(X,lag):\n",
    "    lag=1\n",
    "    for column in X.columns:\n",
    "        X[column + \"_Lag\"] = X[column].shift(lag).fillna(0)\n",
    "    return X\n",
    "\n",
    "X_train=add_lag_feature(X_train,lag=1)\n",
    "X_test=add_lag_feature(X_test,lag=1)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "查看issues_new和其滞后特征issues_new_Lag"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train[[\"issues_new\", \"issues_new_Lag\"]]"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "定义XGBoost模型与训练验证过程"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 定义XGBoost回归模型\n",
    "def train_xgboost(X_train,X_test,y_train,y_test,epoch):\n",
    "\n",
    "    # 模型建立\n",
    "    model = xgb.XGBRegressor()\n",
    "\n",
    "    # 参数设置\n",
    "    params = {'colsample_bytree': 1,\n",
    "              'colsample_bylevel': 1,\n",
    "              'learning_rate': 0.06,\n",
    "              'max_depth': 9, \n",
    "              'alpha': 10,\n",
    "              'subsample':1,\n",
    "              'min_child_weight':4,\n",
    "              'gamma':0.2,\n",
    "              'reg_alpha':0.1,\n",
    "              'reg_lambda':0.3,\n",
    "              'scale_pos_weight':1}\n",
    "\n",
    "    # 交叉验证\n",
    "    dtrain = xgb.DMatrix(X_train, label=y_train)\n",
    "    cv_results = xgb.cv(params, dtrain, num_boost_round=epoch, nfold=5, metrics='rmse', early_stopping_rounds=10)\n",
    "\n",
    "    # 获取最佳迭代轮数,训练模型\n",
    "    best_num_boost_rounds = cv_results.shape[0]  # 最佳的迭代轮数\n",
    "    best_model = xgb.train(params, dtrain, num_boost_round=best_num_boost_rounds)\n",
    "\n",
    "    # 将测试集数据转换为DMatrix格式\n",
    "    dtest = xgb.DMatrix(X_test)\n",
    "\n",
    "    # 使用最佳模型进行预测\n",
    "    y_pred = best_model.predict(dtest)\n",
    "\n",
    "    # 评估\n",
    "    mse = mean_squared_error(y_test, y_pred)\n",
    "    rmse=np.sqrt(mse)\n",
    "    score=explained_variance_score(y_test, y_pred)\n",
    "    r2 = r2_score(y_test, y_pred)\n",
    "    mae = mean_absolute_error(y_test, y_pred)\n",
    "\n",
    "    # 打印评估结果\n",
    "    print(\"RMSE:{}\".format(rmse))\n",
    "    print(\"MAE:{}\".format(mae))\n",
    "    print(\"explained_variance_score:{}\".format(score))\n",
    "    print(\"R2 score:{}\".format(r2))\n",
    "\n",
    "    return best_model,cv_results"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "进行训练与测试"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "best_model_xgb,cv_results=train_xgboost(X_train,X_test,y_train,y_test,epoch=1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(cv_results)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "绘制特征重要性图"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 获取特征重要性\n",
    "plt.figure(dpi=200)\n",
    "plt.rcParams['figure.dpi'] = 200\n",
    "fig, ax = plt.subplots(figsize=(10, 8))\n",
    "\n",
    "# 绘制特征重要性\n",
    "xgb.plot_importance(best_model_xgb, ax=ax)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 降序查看特征重要性\n",
    "xgb_feature_importance=best_model_xgb.get_score(importance_type='weight')\n",
    "sorted_importance = {feature: importance for feature, importance in sorted(xgb_feature_importance.items(), key=lambda x: x[1], reverse=True)}\n",
    "for feature, importance in sorted_importance.items():\n",
    "    print(f'{feature}: {importance}')"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "SHAP机器学习可解释性"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 导入并初始化shap\n",
    "import shap\n",
    "shap.initjs()\n",
    "\n",
    "# shap解释器\n",
    "explainer_xgb = shap.TreeExplainer(best_model_xgb)\n",
    "\n",
    "# shap value\n",
    "shap_values_xgb = explainer_xgb(X_train)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "绘制两种summary_plot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(20, 15))\n",
    "shap.summary_plot(shap_values_xgb, X_train, plot_type=\"bar\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(20, 15))\n",
    "shap.summary_plot(shap_values_xgb, X_train)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "绘制单个样本每个特征的SHAP value贡献度(force_plot和waterfall)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "shap.force_plot(explainer_xgb.expected_value, shap_values_xgb.values[20000])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "shap.plots.waterfall(shap_values_xgb[20000])"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 降维：特征选择"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 要剔除的特征\n",
    "remove_columns=[\"Time_Lag\",\n",
    "                \"inactive_contributors_Lag\",\n",
    "                \"new_contributors_Lag\",\n",
    "                \"change_requests_accepted_Lag\",\n",
    "                \"issue_age_Lag\",\n",
    "                \"new_contributors\",\n",
    "                \"change_requests_accepted\",\n",
    "                \"code_change_lines_add_Lag\",\n",
    "                \"code_change_lines_remove\",\n",
    "                \"code_change_lines_remove_Lag\",\n",
    "                \"code_change_lines_sum_Lag\",\n",
    "                \"code_change_lines_sum\",\n",
    "                \"change_request_resolution_duration_Lag\"\n",
    "                ]\n",
    "\n",
    "X_train_reduce=X_train.drop(columns=remove_columns)\n",
    "X_test_reduce=X_test.drop(columns=remove_columns)\n",
    "\n",
    "best_model=train_xgboost(X_train_reduce,X_test_reduce,y_train,y_test,epoch=1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 要剔除的特征\n",
    "remove_columns=[\"Time_Lag\",\n",
    "                \"inactive_contributors_Lag\",\n",
    "                \"new_contributors_Lag\",\n",
    "                \"change_requests_accepted_Lag\",\n",
    "                \"issue_age_Lag\",\n",
    "                \"new_contributors\",\n",
    "                \"change_requests_accepted\",\n",
    "                \"code_change_lines_add_Lag\",\n",
    "                \"code_change_lines_remove\",\n",
    "                \"code_change_lines_remove_Lag\",\n",
    "                \"code_change_lines_sum_Lag\",\n",
    "                \"code_change_lines_sum\",\n",
    "                \"change_request_resolution_duration_Lag\",\n",
    "                \"technical_fork\"\n",
    "                ]\n",
    "\n",
    "X_train_reduce=X_train.drop(columns=remove_columns)\n",
    "X_test_reduce=X_test.drop(columns=remove_columns)\n",
    "\n",
    "best_model=train_xgboost(X_train_reduce,X_test_reduce,y_train,y_test,epoch=1000)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "根据openrank阈值确定项目阶段"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 分位数\n",
    "quantiles = y_train.describe().loc[['25%', '50%', '75%']]\n",
    "\n",
    "# 四阶段划分\n",
    "def Stage4(openrank):\n",
    "    q25,q50,q75=quantiles\n",
    "    if openrank<=q25:\n",
    "        stage=1\n",
    "    elif openrank<=q50:\n",
    "        stage=2\n",
    "    elif openrank<=q75:\n",
    "        stage=3\n",
    "    else:\n",
    "        stage=4\n",
    "    return stage\n",
    "\n",
    "# 两阶段划分\n",
    "def Stage2(openrank):\n",
    "    q25,q50,q75=quantiles\n",
    "    if openrank<=q50:\n",
    "        stage=1\n",
    "    else:\n",
    "        stage=2\n",
    "\n",
    "    return stage\n",
    "\n",
    "# 根据划分函数与阶段数划分数据\n",
    "def get_train_test_data(data,Stage,stage):\n",
    "    lag=1\n",
    "\n",
    "    # 获取划分函数和阶段数对应的数据\n",
    "    data_stage=data[data[\"openrank\"].apply(Stage) == stage]\n",
    "\n",
    "    # 划分训练集与验证集\n",
    "    test_data_stage=data_stage[data_stage['time']>=\"2023-01\"]\n",
    "    train_data_stage=data_stage[data_stage['time']<\"2023-01\"]\n",
    "    X_train_stage=train_data_stage.drop(columns=[\"openrank\",\"time\"]).fillna(0)\n",
    "    y_train_stage=train_data_stage[\"openrank\"]\n",
    "    X_test_stage=test_data_stage.drop(columns=[\"openrank\",\"time\"]).fillna(0)\n",
    "    y_test_stage=test_data_stage[\"openrank\"]\n",
    "\n",
    "    # 滞后特征\n",
    "    for column in X_train_stage.columns:\n",
    "        X_train_stage[column + \"_Lag\"] = X_train_stage[column].shift(lag).fillna(0)\n",
    "        X_test_stage[column + \"_Lag\"] = X_test_stage[column].shift(lag).fillna(0)\n",
    "\n",
    "    # 降维\n",
    "    X_train_stage=X_train_stage.drop(columns=remove_columns)\n",
    "    X_test_stage=X_test_stage.drop(columns=remove_columns)\n",
    "    return X_train_stage,X_test_stage,y_train_stage,y_test_stage\n",
    "    "
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "两阶段数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train1,X_test1,y_train1,y_test1=get_train_test_data(data,Stage2,stage=1)\n",
    "X_train2,X_test2,y_train2,y_test2=get_train_test_data(data,Stage2,stage=2)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "两阶段模型训练验证"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "best_model1,cv_results=train_xgboost(X_train1,X_test1,y_train1,y_test1,epoch=1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "best_model2,cv_results=train_xgboost(X_train2,X_test2,y_train2,y_test2,epoch=1000)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "四阶段数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train1,X_test1,y_train1,y_test1=get_train_test_data(data,Stage4,stage=1)\n",
    "X_train2,X_test2,y_train2,y_test2=get_train_test_data(data,Stage4,stage=2)\n",
    "X_train3,X_test3,y_train3,y_test3=get_train_test_data(data,Stage4,stage=3)\n",
    "X_train4,X_test4,y_train4,y_test4=get_train_test_data(data,Stage4,stage=4)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "四阶段模型训练验证"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "best_model1,cv_results=train_xgboost(X_train1,X_test1,y_train1,y_test1,epoch=1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "best_model2,cv_results=train_xgboost(X_train2,X_test2,y_train2,y_test2,epoch=1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "best_model3,cv_results=train_xgboost(X_train3,X_test3,y_train3,y_test3,epoch=1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "best_model4,cv_results=train_xgboost(X_train4,X_test4,y_train4,y_test4,epoch=1000)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "定义LightGBM模型与训练验证过程"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def train_lightgbm(X_train, X_test, y_train, y_test,epoch):\n",
    "    # 定义LightGBM回归模型\n",
    "    model = lgb.LGBMRegressor(objective='regression')\n",
    "\n",
    "    # 参数设置\n",
    "    params = {'colsample_bytree': 1,\n",
    "            'learning_rate': 0.06,\n",
    "            'max_depth': 9, \n",
    "            'alpha': 10,\n",
    "            'subsample':1,\n",
    "            'min_child_weight':4,\n",
    "            'reg_alpha':0.1,\n",
    "            'reg_lambda':0.3,\n",
    "            'scale_pos_weight':1,\n",
    "            \"verbose\": -1\n",
    "            }\n",
    "\n",
    "    # 创建LightGBM的数据对象\n",
    "    dtrain = lgb.Dataset(X_train, label=y_train)\n",
    "\n",
    "    # 交叉验证\n",
    "    cv_results = lgb.cv(params, dtrain, num_boost_round=epoch, nfold=5, metrics='rmse', stratified=False)\n",
    "\n",
    "    # 使用最佳迭代轮数训练模型\n",
    "    best_num_boost_rounds = len(cv_results['valid rmse-mean'])\n",
    "    best_model = lgb.train(params, dtrain, num_boost_round=best_num_boost_rounds)\n",
    "\n",
    "    # 使用最佳模型进行预测\n",
    "    y_pred = best_model.predict(X_test)\n",
    "\n",
    "    # 评估\n",
    "    mse = mean_squared_error(y_test, y_pred)\n",
    "    rmse = np.sqrt(mse)\n",
    "    score = explained_variance_score(y_test, y_pred)\n",
    "    r2 = r2_score(y_test, y_pred)\n",
    "    mae = mean_absolute_error(y_test, y_pred)\n",
    "\n",
    "    # 打印评估结果\n",
    "    print(\"RMSE: {}\".format(rmse))\n",
    "    print(\"MAE: {}\".format(mae))\n",
    "    print(\"explained_variance_score: {}\".format(score))\n",
    "    print(\"R2 score: {}\".format(r2))\n",
    "\n",
    "    return best_model,cv_results\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "进行训练与测试"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "best_model_lgb,cv_results=train_lightgbm(X_train,X_test,y_train,y_test,epoch=1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cv_results"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "绘制特征重要性图"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 获取特征重要性\n",
    "plt.figure(dpi=200)\n",
    "plt.rcParams['figure.dpi'] = 200\n",
    "fig, ax = plt.subplots(figsize=(10, 8))\n",
    "\n",
    "# 绘制特征重要性\n",
    "lgb.plot_importance(best_model_lgb, ax=ax)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 降序查看特征重要性\n",
    "lgb_feature_importance=best_model_lgb.feature_importance()\n",
    "lgb_feature_name = best_model_lgb.feature_name()\n",
    "sorted_importance = sorted(zip(lgb_feature_name, lgb_feature_importance), key=lambda x: x[1], reverse=True)\n",
    "\n",
    "for name, importance in sorted_importance:\n",
    "    print(f'{name}: {importance}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 合并特征重要性并取平均\n",
    "combined_importance = {}\n",
    "for feature, importance in dict(sorted_importance).items():\n",
    "    combined_importance[feature] = (importance + xgb_feature_importance.get(feature, 0)) / 2\n",
    "\n",
    "# 按照特征重要性降序排序\n",
    "sorted_importance_combined = sorted(combined_importance.items(), key=lambda x: x[1], reverse=True)\n",
    "\n",
    "# 查看特征重要性\n",
    "for feature, importance in sorted_importance_combined:\n",
    "    print(f'{feature}: {importance}')"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "绘制平均后的特征重要性"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 提取特征名称和对应的重要性值\n",
    "feature_names = [feature for feature, _ in sorted_importance_combined]\n",
    "importance_values = [importance for _, importance in sorted_importance_combined]\n",
    "\n",
    "# 绘制横向柱状图\n",
    "plt.figure(figsize=(10, 10))\n",
    "bars=plt.barh(feature_names, importance_values)\n",
    "plt.xlabel('Importance')\n",
    "plt.ylabel('Features')\n",
    "plt.title('Feature Importance')\n",
    "plt.gca().invert_yaxis()  \n",
    "\n",
    "# 添加数值标注\n",
    "for i, bar in enumerate(bars):\n",
    "    plt.text(bar.get_width() + 0.1, bar.get_y() + bar.get_height() / 2, f'{importance_values[i]:.2f}', ha='left', va='center')\n",
    "plt.tight_layout()  \n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 导入并初始化shap\n",
    "import shap\n",
    "shap.initjs()\n",
    "\n",
    "# 设置模型的目标\n",
    "best_model_lgb.params['objective'] ='reg:linear'\n",
    "\n",
    "# shap解释器\n",
    "explainer_lgb = shap.TreeExplainer(best_model_lgb)\n",
    "\n",
    "# shap value\n",
    "shap_values_lgb = explainer_lgb(X_train)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "绘制两种summary_plot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(dpi=200)\n",
    "shap.summary_plot(shap_values_lgb, X_train, plot_type=\"bar\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(dpi=200)\n",
    "shap.summary_plot(shap_values_lgb, X_train)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "绘制单个样本每个特征的SHAP value贡献度(force_plot和waterfall)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "shap.force_plot(explainer_lgb.expected_value, shap_values_lgb.values[20000])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(dpi=200)\n",
    "shap.plots.waterfall(shap_values_lgb[20000])"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "降维（特征选择）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 要剔除的特征\n",
    "remove_columns=[\"Time_Lag\",\n",
    "                \"inactive_contributors_Lag\",\n",
    "                \"new_contributors_Lag\",\n",
    "                \"change_requests_accepted_Lag\",\n",
    "                \"issue_age_Lag\"\n",
    "                ]\n",
    "\n",
    "X_train_reduce=X_train.drop(columns=remove_columns)\n",
    "X_test_reduce=X_test.drop(columns=remove_columns)\n",
    "\n",
    "best_model=train_lightgbm(X_train_reduce,X_test_reduce,y_train,y_test,epoch=1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 要剔除的特征\n",
    "remove_columns=[\"Time_Lag\",\n",
    "                \"inactive_contributors_Lag\",\n",
    "                \"new_contributors_Lag\",\n",
    "                \"change_requests_accepted_Lag\",\n",
    "                \"issue_age_Lag\",\n",
    "                \"new_contributors\",\n",
    "                \"change_requests_accepted\",\n",
    "                \"code_change_lines_add_Lag\",\n",
    "                \"code_change_lines_remove\"\n",
    "                ]\n",
    "\n",
    "X_train_reduce=X_train.drop(columns=remove_columns)\n",
    "X_test_reduce=X_test.drop(columns=remove_columns)\n",
    "\n",
    "best_model=train_lightgbm(X_train_reduce,X_test_reduce,y_train,y_test,epoch=1000)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "两阶段数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train1,X_test1,y_train1,y_test1=get_train_test_data(data,Stage2,stage=1)\n",
    "X_train2,X_test2,y_train2,y_test2=get_train_test_data(data,Stage2,stage=2)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "两阶段模型训练验证"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "best_model1,cv_results=train_lightgbm(X_train1,X_test1,y_train1,y_test1,epoch=1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "best_model2,cv_results=train_lightgbm(X_train2,X_test2,y_train2,y_test2,epoch=1000)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "四阶段数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train1,X_test1,y_train1,y_test1=get_train_test_data(data,Stage4,stage=1)\n",
    "X_train2,X_test2,y_train2,y_test2=get_train_test_data(data,Stage4,stage=2)\n",
    "X_train3,X_test3,y_train3,y_test3=get_train_test_data(data,Stage4,stage=3)\n",
    "X_train4,X_test4,y_train4,y_test4=get_train_test_data(data,Stage4,stage=4)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "四阶段模型训练验证"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "best_model1,cv_results=train_lightgbm(X_train1,X_test1,y_train1,y_test1,epoch=1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "best_model2,cv_results=train_lightgbm(X_train2,X_test2,y_train2,y_test2,epoch=1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "best_model3,cv_results=train_lightgbm(X_train3,X_test3,y_train3,y_test3,epoch=1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "best_model4,cv_results=train_lightgbm(X_train4,X_test4,y_train4,y_test4,epoch=1000)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "test2",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.18"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
